/* Copyright 2013 Tobias Marschall
 *
 * This file is part of CLEVER.
 *
 * CLEVER is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * CLEVER is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with CLEVER.  If not, see <http://www.gnu.org/licenses/>.
 */


#ifndef GENOTYPER_H
#define GENOTYPER_H

#include <limits>

#include <bamtools/api/BamReader.h>

#include "Variation.h"
#include "GenotypeDistribution.h"
#include "OverlappingRegions.h"

class Genotyper {
public:
	typedef struct ref_and_alt_support_t {
		int ref_support;
		int alt_support;
		ref_and_alt_support_t() : ref_support(-1), alt_support(-1) {}
		ref_and_alt_support_t(int ref_support, int alt_support) : ref_support(ref_support), alt_support(alt_support) {}
		ref_and_alt_support_t& operator+=(const ref_and_alt_support_t& rhs) {
			this->ref_support += rhs.ref_support;
			this->alt_support += rhs.alt_support;
			return *this;
		}
		int coverage() const { return ref_support + alt_support; }
	} ref_and_alt_support_t;
	
	typedef struct read_error_probabilies_t {
		double p_fp;
		double p_fn;
		read_error_probabilies_t() : p_fp(1.0), p_fn(1.0) {}
		read_error_probabilies_t(double p_fp, double p_fn) : p_fp(p_fp), p_fn(p_fn) {}
	} read_error_probabilies_t;

	typedef struct variation_stats_t {
		ref_and_alt_support_t split_evidence;
		ref_and_alt_support_t insert_evidence;
		std::vector<GenotypeDistribution> read_gt_probabilities;
		variation_stats_t() : split_evidence(0,0), insert_evidence(0,0) {}
	} variation_stats_t;

	typedef struct trio_genotype_t {
		GenotypeDistribution::genotype_t mother;
		GenotypeDistribution::genotype_t father;
		GenotypeDistribution::genotype_t child;
		GenotypeDistribution mother_posterior;
		GenotypeDistribution father_posterior;
		GenotypeDistribution child_posterior;
		trio_genotype_t() : mother(GenotypeDistribution::ABSENT), father(GenotypeDistribution::ABSENT), child(GenotypeDistribution::ABSENT) {}
		trio_genotype_t(GenotypeDistribution::genotype_t mother, GenotypeDistribution::genotype_t father, GenotypeDistribution::genotype_t child) : mother(mother), father(father), child(child) {}
	} trio_genotype_t;

	typedef struct quartet_genotype_t {
		GenotypeDistribution::genotype_t mother;
		GenotypeDistribution::genotype_t father;
		GenotypeDistribution::genotype_t child1;
		GenotypeDistribution::genotype_t child2;
		GenotypeDistribution mother_posterior;
		GenotypeDistribution father_posterior;
		GenotypeDistribution child1_posterior;
		GenotypeDistribution child2_posterior;
		quartet_genotype_t() : mother(GenotypeDistribution::ABSENT), father(GenotypeDistribution::ABSENT), child1(GenotypeDistribution::ABSENT), child2(GenotypeDistribution::ABSENT) {}
		quartet_genotype_t(GenotypeDistribution::genotype_t mother, GenotypeDistribution::genotype_t father, GenotypeDistribution::genotype_t child1, GenotypeDistribution::genotype_t child2) : mother(mother), father(father), child1(child1), child2(child2) {}
	} quartet_genotype_t;

	typedef boost::unordered_set<std::string> read_name_set_t;

	typedef struct mean_and_stddev_t {
		double mean;
		double stddev;
		mean_and_stddev_t() : mean(std::numeric_limits<double>::quiet_NaN()), stddev(std::numeric_limits<double>::quiet_NaN()) {}
		mean_and_stddev_t(double mean, double stddev) : mean(mean), stddev(stddev) {}
	} mean_and_stddev_t;

	typedef boost::unordered_map<std::string,mean_and_stddev_t> readgroup_params_map_t;
private:
	const readgroup_params_map_t* readgroup_params;
	double isize_mean;
	double isize_stddev;
	double variant_prior;
	int mapq_threshold;
	int bam_window_size;
	bool use_insert_size;
	bool use_split_reads;
	int coverage_threshold;
	int masking_distance;
	int max_length_diff_split_split;
	int max_distance_split_split;
	int split_read_border_size;
	
	GenotypeDistribution compute_genotype(int support, const std::vector<read_error_probabilies_t>& read_probabilities) const;
public:
	/** Constructor for the case with one global mean and stddev, independent of read group. */
	Genotyper(double isize_mean, double isize_stddev, double variant_prior, int max_length_diff_split_split = 0, int max_distance_split_split = 0, int mapq_threshold = 0, int bam_window_size = 1000, bool use_insert_size = true, bool use_split_reads = true, int split_read_border_size = 10);
	
	/** Constructor for the case with read group specific mean and stddev. */
	Genotyper(const readgroup_params_map_t* readgroup_params, double variant_prior, int max_length_diff_split_split = 0, int max_distance_split_split = 0, int mapq_threshold = 0, int bam_window_size = 1000, bool use_insert_size = true, bool use_split_reads = true, int split_read_border_size = 10);
	
	/** Configure masking behavior; when region with coverage above threshold is encountered during genotyping,
	 *  the genotype ABSENT is returned and all positions with a distance <= masking_distance from
	 *  this variation is masked.
	 */
	void enableMasking(int coverage_threshold, int masking_distance);
	
	/** Extract statistics needed to compute a genotype from a BAM file. If masked_regions
	 *  are given and intersection of variation and these regions is not empty, 0 is returned.
	 *  When enableMasking was previously called, an additional region might be added to masked_regions
	 *  according to the made settings. When used_reads is not 0, only reads not in this set are used
	 *  and those reads used are added to the set.
	 */
	std::auto_ptr<variation_stats_t> extractVariationStats(const Variation& v, BamTools::BamReader& bam_reader, OverlappingRegions* masked_regions = 0, read_name_set_t* used_reads = 0) const;
	
	/** Given statistics on variation, computes the genotype distribution. 
	 *  @param raw_genotype: If not null, then the distribution without any use of priors is written to this record.
	 */
	std::auto_ptr<GenotypeDistribution> computeGenotype(const Variation& v, const variation_stats_t& stats, GenotypeDistribution* raw_genotype = 0) const;
	
	/** Medelian-Inheritance-aware genotyping of a trio, given the individual genotype distributions. */
	static std::auto_ptr<trio_genotype_t> genotypeTrio(const GenotypeDistribution& mother, const GenotypeDistribution& father, const GenotypeDistribution& child, double denovo_threshold);

	/** Medelian-Inheritance-aware genotyping of a quartet, given the individual genotype distributions. */
	static std::auto_ptr<quartet_genotype_t> genotypeQuartet(const GenotypeDistribution& mother, const GenotypeDistribution& father, const GenotypeDistribution& child1, const GenotypeDistribution& child2, double denovo_threshold, bool monozygotic = false);
};

#endif // GENOTYPER_H
